# Deactivate scientific notation
options(scipen = 999, max.print = 10000)
# Seed
seed = 2828
# Load libraries
pacman::p_load(
kableExtra,
an9elproject,
tidyverse,
lubridate,
magrittr,
survival,
survminer,
car,
StepReg,
plotly,
install = FALSE, update = FALSE
)
# Load DB
# oncoth1 = get_project("oncothr1", version = "0.0.8004")
load("/mnt/ir-bioinf02/home/blobato/oncothromb01/oncothr1_versions/oncothr1.RData_0.0.8004")
oncoth1 = obj
# Get data slot
oncoth1_data = oncoth1$data
# Group tobacco use into 'Never smoked' vs 'Former/current'
oncoth1_data %<>%
mutate(tobacco_use_imp_grouped = case_when(
tobacco_use_imp == "Never has smoked" ~ "Never",
tobacco_use_imp == "Former smoker" ~ "Former/current",
tobacco_use_imp == "Active smoker" ~ "Former/current"),
.after = tobacco_use_imp)
# Create quartiles for age
oncoth1_data %<>%
mutate(age_quartiles = case_when(
age_when_cancer_dx <= 60 ~ "33-60",
age_when_cancer_dx > 60 & age_when_cancer_dx <= 69 ~ "61-69",
age_when_cancer_dx >= 70 ~ "70-93"
), .after = age_when_cancer_dx) %>%
mutate(age_quartiles = as.factor(age_quartiles))
This dataset must be in the proper format to deal with time-dependent
covariates and left- and right-censoring. The function
survival::tmerge() helps us with it.
# Get subset of covariates for survival analysis
covariates_df = oncoth1_data %>%
select(id,
patient_group,
age_when_cancer_dx,
age_quartiles,
age_when_cancer_dx_50,
age_when_cancer_dx_65,
gender,
ethnicity,
ethnicity_grouped,
menopausal_status_imp,
weight,
height,
body_surface_area,
bmi_value,
bmi_category,
performance_status_category_corrected_imp,
hormone_therapy,
major_surgery_in_medical_history,
pregnancy_imp,
oral_contraceptive_tx,
diabetes_mellitus,
dyslipidemia,
tobacco_use_imp,
tobacco_use_imp_grouped,
copd_imp,
arterial_hypertension,
chronic_cardiac_insufficiency,
renal_insufficiency,
khorana_risk_score_imp,
krs_category,
two_groups_krs,
tic_onco_imp,
previous_onco_surgery,
primary_tumor_simplified,
tumor_surgically_removed,
progression_according_to_clinical_stage_imp,
tnm_stage,
tnm_stage_detailed,
tnm_stage_grouped,
tnm_stage_two_groups,
t_stage_imp,
t_stage_grouped,
n_stage_imp,
n_stage_grouped,
m_stage,
m_stage_grouped,
histology_type_imp,
mucinous_histology_imp,
grade_histological_differentiation_imp,
n_metastases,
n_metastases_grouped,
metastasis_adrenal_glands,
metastasis_bones,
metastasis_brain,
metastasis_liver,
metastasis_lung,
metastasis_lymph_nodes,
metastasis_peritoneum,
metastasis_pleura,
metastasis_skin,
metastasis_soft_tissues,
other_metastases,
catheter_device_imp,
venous_insufficiency_imp,
n_comorbidities,
family_background_vte,
family_background_vte_n,
family_background_vte_all_relatives,
family_background_VTE_father,
family_background_VTE_mother,
family_background_VTE_siblings,
family_background_VTE_offspring,
family_background_VTE_other_relatives,
previous_vte,
previous_ate,
n_previous_ate,
previous_ate_type,
cva_ate,
myocardial_infarction_ate,
pad_ate,
tia_ate,
other_thromb_ate,
anticoag_tx_vte,
anticoag_tx_vte_drugs,
anticoag_tx_cardiopathy,
anticoag_tx_other_causes,
anticoag_tx_reason,
anticoag_tx_apixaban,
anticoag_tx_dabigatran,
anticoag_tx_lmwh,
anticoag_tx_lmwh_dosage,
anticoag_tx_vit_k_antag,
anticoag_tx_other_drugs,
anticoag_tx_currently,
n_antiaggreg_tx,
antiaggreg_tx_reason,
antiaggreg_tx_cardiopathy,
antiaggreg_tx_cva,
antiaggreg_tx_other_causes,
antiaggreg_tx_currently,
tu_1st_blood_transfusion_from_cancer_diagnosis_days,
tu_2nd_blood_transfusion_from_cancer_diagnosis_days,
tu_3rd_blood_transfusion_from_cancer_diagnosis_days,
tu_1st_major_trauma_from_cancer_diagnosis_days,
tu_1st_immobilisation_from_cancer_diagnosis_days,
tu_2nd_immobilisation_from_cancer_diagnosis_days,
tu_3rd_immobilisation_from_cancer_diagnosis_days,
tu_1st_recent_major_surgery_from_cancer_diagnosis_days,
tu_2nd_recent_major_surgery_from_cancer_diagnosis_days,
tu_3rd_recent_major_surgery_from_cancer_diagnosis_days,
tu_1st_thromboprophylaxis_hospital_from_cancer_diagnosis_days,
tu_1st_erythropoietin_tx_from_cancer_diagnosis_days,
tu_2nd_erythropoietin_tx_from_cancer_diagnosis_days,
tu_3rd_erythropoietin_tx_from_cancer_diagnosis_days,
vte_recurrence)
# Get subset of variables containing information about patient status and time until events
survival_data = oncoth1_data %>%
select(id,
patient_group,
date_cancer_dx,
date_study_begins,
date_study_ends,
date_death,
patient_status_at_end_study,
censored_patient,
follow_up_length_from_cancer_diagnosis_days,
follow_up_length_from_cancer_diagnosis_months,
tu_1st_vte_all_from_cancer_diagnosis_days_corrected,
tu_1st_vte_all_from_cancer_diagnosis_months_corrected,
tu_2nd_vte_all_from_cancer_diagnosis_days_corrected,
tu_2nd_vte_all_from_cancer_diagnosis_months_corrected,
tu_3rd_vte_all_from_cancer_diagnosis_days_corrected,
tu_3rd_vte_all_from_cancer_diagnosis_months_corrected,
tu_4th_vte_all_from_cancer_diagnosis_days_corrected,
tu_4th_vte_all_from_cancer_diagnosis_months_corrected
)
# Create object tmerge for managing time-dependant covariates
# Add time until death/censoring and time until VTE
# For interval censored data, the status indicator is
# 0=right censored
# 1=event at time
# 2=left censored
# 3=interval censored
survival_vte_tmerge = tmerge(
data1 = covariates_df,
data2 = survival_data,
id = id,
tstop = follow_up_length_from_cancer_diagnosis_days,
# event() is the survival analysis event of interest (in our case, death)
death = event(follow_up_length_from_cancer_diagnosis_days, censored_patient),
# tdc() are for time-dependent covariates
vte_event = tdc(tu_1st_vte_all_from_cancer_diagnosis_days_corrected),
vte_event = tdc(tu_2nd_vte_all_from_cancer_diagnosis_days_corrected),
vte_event = tdc(tu_3rd_vte_all_from_cancer_diagnosis_days_corrected),
vte_event = tdc(tu_4th_vte_all_from_cancer_diagnosis_days_corrected)
)
# Add other time-dependent covariates (i.e., blood transfusion, immobilisation, etc.)
survival_vte_tmerge = tmerge(
survival_vte_tmerge,
survival_vte_tmerge,
id = id,
blood_transfusion = tdc(tu_1st_blood_transfusion_from_cancer_diagnosis_days),
blood_transfusion = tdc(tu_2nd_blood_transfusion_from_cancer_diagnosis_days),
blood_transfusion = tdc(tu_3rd_blood_transfusion_from_cancer_diagnosis_days),
immobilisation = tdc(tu_1st_immobilisation_from_cancer_diagnosis_days),
immobilisation = tdc(tu_2nd_immobilisation_from_cancer_diagnosis_days),
immobilisation = tdc(tu_3rd_immobilisation_from_cancer_diagnosis_days),
recent_major_surgery = tdc(tu_1st_recent_major_surgery_from_cancer_diagnosis_days),
recent_major_surgery = tdc(tu_2nd_recent_major_surgery_from_cancer_diagnosis_days),
recent_major_surgery = tdc(tu_3rd_recent_major_surgery_from_cancer_diagnosis_days),
thromboprophylaxis_hospital = tdc(tu_1st_thromboprophylaxis_hospital_from_cancer_diagnosis_days),
erythropoietin_tx = tdc(tu_1st_erythropoietin_tx_from_cancer_diagnosis_days),
erythropoietin_tx = tdc(tu_2nd_erythropoietin_tx_from_cancer_diagnosis_days),
erythropoietin_tx = tdc(tu_3rd_erythropoietin_tx_from_cancer_diagnosis_days)
)
We will fit several survival curves using the Kaplan-Meier estimator.
# Fit Kaplan-Meier estimator
km_fit_vte = survfit(
Surv(time = tstart, time2 = tstop, event = death) ~
vte_event +
cluster(id), # ensures that the survival estimates account for the correlation within individuals, providing more robust results in the presence of repeated measurements
data = survival_vte_tmerge)
km_fit_vte
## Call: survfit(formula = Surv(time = tstart, time2 = tstop, event = death) ~
## vte_event + cluster(id), data = survival_vte_tmerge)
##
## records n.max n.start events median 0.95LCL 0.95UCL
## vte_event=0 650 369 369 97 NA NA NA
## vte_event=1 190 50 21 52 274 177 391
This output contains several items:
km_summary = function(km_survfit) {
# Obtain summary from KM estimator
summary_fit = summary(km_survfit)
formatted_summary = data.frame(
time = summary_fit$time,
n_risk = summary_fit$n.risk,
n_event = summary_fit$n.event,
survival = summary_fit$surv,
std_error = summary_fit$std.err,
lower_95CI = summary_fit$lower,
upper_95CI = summary_fit$upper
)
# Show table with summary
formatted_summary %>%
kbl(caption = "Survival Analysis Summary") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE) %>%
column_spec(ncol(formatted_summary))
}
# Show table with summary with all time points from Kaplan-Meier estimation
km_summary(km_fit_vte)
| time | n_risk | n_event | survival | std_error | lower_95CI | upper_95CI |
|---|---|---|---|---|---|---|
| 26.0 | 358 | 1 | 0.9972067 | 0.0027894 | 0.9917546 | 1.0000000 |
| 27.0 | 357 | 1 | 0.9944134 | 0.0039393 | 0.9867225 | 1.0000000 |
| 40.0 | 355 | 1 | 0.9916122 | 0.0048223 | 0.9822055 | 1.0000000 |
| 47.0 | 353 | 1 | 0.9888031 | 0.0055671 | 0.9779519 | 0.9997748 |
| 50.0 | 351 | 1 | 0.9859860 | 0.0062233 | 0.9738638 | 0.9982592 |
| 58.0 | 350 | 1 | 0.9831689 | 0.0068133 | 0.9699053 | 0.9966139 |
| 68.0 | 349 | 2 | 0.9775347 | 0.0078532 | 0.9622634 | 0.9930485 |
| 84.0 | 345 | 1 | 0.9747013 | 0.0083259 | 0.9585187 | 0.9911571 |
| 87.0 | 344 | 1 | 0.9718679 | 0.0087706 | 0.9548290 | 0.9892108 |
| 99.0 | 338 | 1 | 0.9689925 | 0.0092039 | 0.9511201 | 0.9872008 |
| 101.0 | 336 | 1 | 0.9661086 | 0.0096177 | 0.9474410 | 0.9851441 |
| 115.0 | 326 | 1 | 0.9631451 | 0.0100344 | 0.9436775 | 0.9830143 |
| 124.0 | 318 | 1 | 0.9601163 | 0.0104499 | 0.9398517 | 0.9808179 |
| 127.0 | 317 | 1 | 0.9570876 | 0.0108470 | 0.9360622 | 0.9785852 |
| 137.0 | 314 | 1 | 0.9540395 | 0.0112326 | 0.9322761 | 0.9763109 |
| 141.0 | 312 | 1 | 0.9509817 | 0.0116053 | 0.9285056 | 0.9740019 |
| 143.0 | 311 | 2 | 0.9448661 | 0.0123100 | 0.9210443 | 0.9693040 |
| 149.0 | 306 | 1 | 0.9417783 | 0.0126512 | 0.9173060 | 0.9669034 |
| 152.0 | 304 | 1 | 0.9386803 | 0.0129833 | 0.9135753 | 0.9644752 |
| 153.0 | 303 | 1 | 0.9355824 | 0.0133049 | 0.9098653 | 0.9620263 |
| 154.0 | 302 | 1 | 0.9324844 | 0.0136168 | 0.9061743 | 0.9595583 |
| 160.0 | 298 | 1 | 0.9293553 | 0.0139260 | 0.9024578 | 0.9570544 |
| 162.0 | 297 | 2 | 0.9230970 | 0.0145183 | 0.8950758 | 0.9519954 |
| 175.0 | 295 | 1 | 0.9199678 | 0.0148024 | 0.8914083 | 0.9494424 |
| 179.0 | 292 | 1 | 0.9168173 | 0.0150833 | 0.8877261 | 0.9468618 |
| 183.0 | 290 | 1 | 0.9136558 | 0.0153590 | 0.8840432 | 0.9442604 |
| 184.0 | 289 | 1 | 0.9104944 | 0.0156279 | 0.8803738 | 0.9416455 |
| 194.0 | 285 | 1 | 0.9072997 | 0.0158962 | 0.8766725 | 0.9389968 |
| 203.0 | 284 | 1 | 0.9041049 | 0.0161581 | 0.8729839 | 0.9363354 |
| 205.0 | 283 | 1 | 0.9009102 | 0.0164138 | 0.8693074 | 0.9336619 |
| 208.0 | 282 | 1 | 0.8977155 | 0.0166636 | 0.8656425 | 0.9309769 |
| 216.0 | 280 | 1 | 0.8945094 | 0.0169097 | 0.8619735 | 0.9282734 |
| 218.0 | 279 | 1 | 0.8913033 | 0.0171503 | 0.8583152 | 0.9255592 |
| 222.0 | 277 | 1 | 0.8880856 | 0.0173877 | 0.8546520 | 0.9228270 |
| 237.0 | 274 | 1 | 0.8848444 | 0.0176237 | 0.8509681 | 0.9200692 |
| 242.0 | 273 | 1 | 0.8816032 | 0.0178547 | 0.8472941 | 0.9173016 |
| 247.0 | 272 | 1 | 0.8783620 | 0.0180808 | 0.8436295 | 0.9145244 |
| 256.0 | 269 | 1 | 0.8750967 | 0.0183061 | 0.8399430 | 0.9117217 |
| 266.0 | 266 | 1 | 0.8718069 | 0.0185305 | 0.8362338 | 0.9088932 |
| 267.0 | 264 | 1 | 0.8685046 | 0.0187523 | 0.8325176 | 0.9060471 |
| 268.0 | 263 | 1 | 0.8652023 | 0.0189695 | 0.8288102 | 0.9031923 |
| 278.0 | 260 | 1 | 0.8618746 | 0.0191862 | 0.8250788 | 0.9003113 |
| 279.0 | 259 | 1 | 0.8585469 | 0.0193986 | 0.8213559 | 0.8974218 |
| 280.0 | 258 | 1 | 0.8552192 | 0.0196067 | 0.8176412 | 0.8945241 |
| 285.0 | 257 | 1 | 0.8518915 | 0.0198108 | 0.8139345 | 0.8916185 |
| 287.0 | 256 | 1 | 0.8485638 | 0.0200110 | 0.8102356 | 0.8887051 |
| 294.0 | 254 | 1 | 0.8452230 | 0.0202091 | 0.8065275 | 0.8857749 |
| 297.0 | 253 | 1 | 0.8418822 | 0.0204035 | 0.8028269 | 0.8828373 |
| 304.0 | 250 | 1 | 0.8385146 | 0.0205979 | 0.7990999 | 0.8798735 |
| 305.0 | 249 | 1 | 0.8351471 | 0.0207887 | 0.7953800 | 0.8769024 |
| 308.0 | 248 | 1 | 0.8317796 | 0.0209758 | 0.7916672 | 0.8739244 |
| 311.0 | 247 | 2 | 0.8250445 | 0.0213398 | 0.7842618 | 0.8679480 |
| 313.0 | 245 | 2 | 0.8183095 | 0.0216905 | 0.7768823 | 0.8619457 |
| 316.0 | 243 | 1 | 0.8149419 | 0.0218611 | 0.7732019 | 0.8589352 |
| 324.0 | 240 | 1 | 0.8115463 | 0.0220321 | 0.7694930 | 0.8558980 |
| 333.0 | 239 | 1 | 0.8081507 | 0.0222000 | 0.7657901 | 0.8528547 |
| 342.0 | 238 | 1 | 0.8047552 | 0.0223649 | 0.7620931 | 0.8498054 |
| 343.0 | 237 | 1 | 0.8013596 | 0.0225269 | 0.7584020 | 0.8467504 |
| 352.0 | 234 | 1 | 0.7979350 | 0.0226894 | 0.7546810 | 0.8436679 |
| 353.0 | 233 | 1 | 0.7945103 | 0.0228490 | 0.7509658 | 0.8405798 |
| 355.0 | 232 | 1 | 0.7910857 | 0.0230057 | 0.7472563 | 0.8374860 |
| 356.0 | 231 | 3 | 0.7808119 | 0.0234592 | 0.7361603 | 0.8281718 |
| 362.0 | 228 | 1 | 0.7773873 | 0.0236049 | 0.7324722 | 0.8250565 |
| 373.0 | 226 | 1 | 0.7739475 | 0.0237498 | 0.7287709 | 0.8219246 |
| 374.0 | 225 | 1 | 0.7705077 | 0.0238920 | 0.7250748 | 0.8187875 |
| 381.0 | 221 | 1 | 0.7670213 | 0.0240370 | 0.7213274 | 0.8156098 |
| 385.0 | 220 | 1 | 0.7635348 | 0.0241792 | 0.7175851 | 0.8124268 |
| 386.0 | 219 | 1 | 0.7600484 | 0.0243189 | 0.7138480 | 0.8092388 |
| 408.0 | 218 | 1 | 0.7565619 | 0.0244560 | 0.7101159 | 0.8060457 |
| 415.0 | 217 | 1 | 0.7530754 | 0.0245905 | 0.7063888 | 0.8028477 |
| 418.0 | 216 | 1 | 0.7495890 | 0.0247226 | 0.7026665 | 0.7996448 |
| 419.0 | 215 | 1 | 0.7461025 | 0.0248522 | 0.6989490 | 0.7964372 |
| 420.0 | 214 | 1 | 0.7426161 | 0.0249795 | 0.6952362 | 0.7932248 |
| 426.0 | 213 | 2 | 0.7356431 | 0.0252268 | 0.6878244 | 0.7867863 |
| 427.0 | 211 | 1 | 0.7321567 | 0.0253471 | 0.6841253 | 0.7835602 |
| 468.0 | 203 | 1 | 0.7285500 | 0.0254775 | 0.6802878 | 0.7802360 |
| 478.0 | 201 | 2 | 0.7213007 | 0.0257345 | 0.6725852 | 0.7735448 |
| 506.0 | 197 | 1 | 0.7176393 | 0.0258630 | 0.6686976 | 0.7701631 |
| 514.0 | 196 | 1 | 0.7139779 | 0.0259889 | 0.6648151 | 0.7667763 |
| 542.0 | 178 | 1 | 0.7099668 | 0.0261506 | 0.6605188 | 0.7631165 |
| 547.0 | 175 | 1 | 0.7059098 | 0.0263140 | 0.6561743 | 0.7594151 |
| 553.0 | 171 | 1 | 0.7017817 | 0.0264820 | 0.6517509 | 0.7556530 |
| 554.0 | 168 | 1 | 0.6976044 | 0.0266518 | 0.6472757 | 0.7518464 |
| 560.0 | 162 | 1 | 0.6932982 | 0.0268329 | 0.6426520 | 0.7479358 |
| 561.0 | 158 | 1 | 0.6889103 | 0.0270194 | 0.6379374 | 0.7439560 |
| 586.0 | 132 | 1 | 0.6836912 | 0.0273142 | 0.6321988 | 0.7393777 |
| 595.0 | 110 | 1 | 0.6774759 | 0.0277640 | 0.6251875 | 0.7341374 |
| 614.0 | 78 | 1 | 0.6687903 | 0.0287345 | 0.6147777 | 0.7275482 |
| 45.0 | 34 | 1 | 0.9705882 | 0.0289760 | 0.9154259 | 1.0000000 |
| 50.0 | 33 | 1 | 0.9411765 | 0.0403526 | 0.8653187 | 1.0000000 |
| 60.0 | 32 | 1 | 0.9117647 | 0.0486433 | 0.8212409 | 1.0000000 |
| 66.0 | 31 | 1 | 0.8823529 | 0.0552551 | 0.7804373 | 0.9975775 |
| 67.0 | 30 | 1 | 0.8529412 | 0.0607387 | 0.7418297 | 0.9806949 |
| 76.0 | 30 | 1 | 0.8245098 | 0.0649678 | 0.7065206 | 0.9622033 |
| 79.0 | 29 | 1 | 0.7960784 | 0.0686098 | 0.6723499 | 0.9425760 |
| 81.0 | 29 | 1 | 0.7686275 | 0.0713975 | 0.6406902 | 0.9221120 |
| 102.5 | 35 | 1 | 0.7466667 | 0.0741286 | 0.6146388 | 0.9070549 |
| 108.0 | 35 | 1 | 0.7253333 | 0.0745794 | 0.5929477 | 0.8872763 |
| 136.0 | 46 | 1 | 0.7095652 | 0.0755026 | 0.5759949 | 0.8741098 |
| 142.0 | 46 | 1 | 0.6941399 | 0.0747986 | 0.5619840 | 0.8573736 |
| 155.0 | 49 | 1 | 0.6799738 | 0.0752883 | 0.5473248 | 0.8447714 |
| 158.0 | 48 | 1 | 0.6658076 | 0.0744211 | 0.5348169 | 0.8288814 |
| 159.0 | 49 | 1 | 0.6522197 | 0.0746348 | 0.5211818 | 0.8162038 |
| 175.5 | 50 | 1 | 0.6391753 | 0.0750670 | 0.5077523 | 0.8046151 |
| 177.0 | 49 | 1 | 0.6261309 | 0.0752042 | 0.4947980 | 0.7923232 |
| 178.0 | 48 | 1 | 0.6130866 | 0.0741947 | 0.4836274 | 0.7771999 |
| 196.0 | 50 | 1 | 0.6008248 | 0.0745133 | 0.4711760 | 0.7661477 |
| 216.0 | 49 | 1 | 0.5885631 | 0.0746233 | 0.4590607 | 0.7545985 |
| 231.0 | 48 | 1 | 0.5763014 | 0.0743636 | 0.4475216 | 0.7421390 |
| 245.0 | 49 | 1 | 0.5645401 | 0.0732355 | 0.4377961 | 0.7279771 |
| 247.0 | 48 | 1 | 0.5527789 | 0.0721052 | 0.4280747 | 0.7138111 |
| 253.0 | 48 | 1 | 0.5412626 | 0.0709484 | 0.4186326 | 0.6998147 |
| 254.0 | 47 | 1 | 0.5297464 | 0.0697898 | 0.4091938 | 0.6858150 |
| 263.0 | 47 | 1 | 0.5184752 | 0.0695743 | 0.3985704 | 0.6744518 |
| 267.0 | 48 | 1 | 0.5076736 | 0.0697549 | 0.3878186 | 0.6645697 |
| 274.0 | 47 | 1 | 0.4968721 | 0.0692230 | 0.3781440 | 0.6528779 |
| 275.0 | 47 | 1 | 0.4863003 | 0.0693049 | 0.3677866 | 0.6430033 |
| 277.0 | 46 | 1 | 0.4757286 | 0.0689086 | 0.3581491 | 0.6319091 |
| 285.0 | 45 | 1 | 0.4651568 | 0.0676756 | 0.3497504 | 0.6186436 |
| 288.0 | 45 | 1 | 0.4548200 | 0.0664309 | 0.3415964 | 0.6055722 |
| 293.0 | 44 | 1 | 0.4444832 | 0.0651852 | 0.3334443 | 0.5924988 |
| 315.0 | 43 | 1 | 0.4341464 | 0.0646618 | 0.3242337 | 0.5813185 |
| 317.0 | 42 | 1 | 0.4238096 | 0.0646638 | 0.3142654 | 0.5715378 |
| 323.0 | 42 | 1 | 0.4137189 | 0.0633933 | 0.3063920 | 0.5586416 |
| 360.0 | 41 | 1 | 0.4036281 | 0.0621022 | 0.2985490 | 0.5456916 |
| 365.0 | 40 | 1 | 0.3935374 | 0.0622261 | 0.2886650 | 0.5365102 |
| 373.0 | 39 | 1 | 0.3834467 | 0.0623399 | 0.2788167 | 0.5273408 |
| 375.0 | 38 | 1 | 0.3733560 | 0.0610309 | 0.2710072 | 0.5143580 |
| 379.0 | 37 | 1 | 0.3632653 | 0.0597201 | 0.2632013 | 0.5013718 |
| 391.0 | 37 | 1 | 0.3534474 | 0.0593381 | 0.2543442 | 0.4911652 |
| 423.0 | 36 | 2 | 0.3338114 | 0.0589956 | 0.2360834 | 0.4719945 |
| 464.0 | 38 | 1 | 0.3250269 | 0.0580308 | 0.2290574 | 0.4612052 |
| 480.0 | 35 | 1 | 0.3157404 | 0.0565772 | 0.2222308 | 0.4485966 |
| 483.0 | 34 | 1 | 0.3064539 | 0.0565645 | 0.2134285 | 0.4400255 |
| 498.0 | 33 | 1 | 0.2971674 | 0.0551030 | 0.2066165 | 0.4274028 |
| 504.0 | 32 | 1 | 0.2878809 | 0.0543287 | 0.1988725 | 0.4167265 |
| 561.0 | 30 | 1 | 0.2782849 | 0.0533702 | 0.1910926 | 0.4052616 |
| 562.0 | 29 | 1 | 0.2686889 | 0.0534381 | 0.1819528 | 0.3967717 |
| 722.0 | 3 | 1 | 0.1791259 | 0.0821321 | 0.0729238 | 0.4399951 |
get_robscore_pval = function(data, covariates, tstart = "tstart", tstop = "tstop", event = "death", cluster = "id", strata = NULL) {
# Fit CPH model to extract robust score test p-value, since it is the best
# way of dealing with left- and right-censored data
# Construct formula
covariates = paste(covariates, collapse = " + ")
# Create formula for CPH model
if (!is.null(strata)) {
formula_string = paste0("Surv(", tstart, ", ", tstop, ", ", event, ") ~ ", covariates, " + strata(", strata, ") + cluster(", cluster, ")")
} else {
formula_string = paste0("Surv(", tstart, ", ", tstop, ", ", event, ") ~ ", covariates, " + cluster(", cluster, ")")
}
# Convert to formula object
formula_string = as.formula(formula_string)
# Train CPH model
cph_fit = coxph(formula_string,data = data)
# Get p-value from robust score test
cph_tests_summary = summary(cph_fit)
robscore_pval = cph_tests_summary$robscore[["pvalue"]]
return(robscore_pval)
}
# We perform the log-rank test separately, using coxph(), because it can handle left- and right-censored data
# Later, we need to manually add the p-value into the KM curves
km_pval = get_robscore_pval(data = survival_vte_tmerge, covariates = c("vte_event"))
# jpeg("../graphs/KM_curve_VTE.jpeg", width = 6.5, height = 4, units = 'in', res = 700)
km_plot_vte =
ggsurvplot(
fit = km_fit_vte,
data = survival_vte_tmerge,
xscale = 30,
break.time.by = 60,
censor = FALSE,
# pval = TRUE,
# pval.method = TRUE,
#pval.size = 4,
conf.int = TRUE,
conf.int.alpha = 0.15,
xlim = c(0, 540),
surv.plot.height = 1.25,
tables.height = 0.25,
legend.title = " ",
legend.labs = c("No VTE", "VTE"),
risk.table = "abs_pct",
risk.table.height = 0.3,
fontsize = 3,
risk.table.col = "strata",
risk.table.y.text.col = TRUE,
# linetype = "strata",
surv.median.line = "hv",
xlab = "Time to death (months)",
ylab = "Overall Survival Probability",
ggtheme = theme_bw(),
palette = c("black", "red")
)
# Add p-value
km_plot_vte$plot = km_plot_vte$plot +
annotate(
geom = "text",
x = 26,
y = 0.3,
label = "Log-rank",
color = "black") +
annotate(
geom = "text",
x = 28,
y = 0.2,
label = as.character(format(
km_pval,
scientific = TRUE,
digits = 1)),
color = "black")
# Customize risk table title
km_plot_vte$table =
km_plot_vte$table +
theme(plot.title = element_text(size = 10))
km_plot_vte
# Save plot
# ggsave(
# filename = "../graphs/KM_curve_VTE.jpeg",
# # width = 6.5,
# # height = 4,
# units = 'in',
# dpi = 700
# )
# dev.off()
VTE recurrence is also a time-dependent covariate.
# KM estimation
km_fit_vte_recurrence = survfit(Surv(
time = tstart,
time2 = tstop,
event = death) ~ vte_recurrence + cluster(id),
id = id,
data = survival_vte_tmerge)
km_fit_vte_recurrence
## Call: survfit(formula = Surv(time = tstart, time2 = tstop, event = death) ~
## vte_recurrence + cluster(id), data = survival_vte_tmerge,
## id = id)
##
## records n events median 0.95LCL 0.95UCL
## vte_recurrence=No VTE 558 302 97 NA NA NA
## vte_recurrence=No VTE recurrence 233 77 43 483 373 NA
## vte_recurrence=VTE recurrence 49 11 9 293 254 NA
# Get ...
km_pval_vte_recurrence = get_robscore_pval(survival_vte_tmerge, covariates = c("vte_recurrence"))
# Plot KM curve of patients with no CAT, with a single CAT and CAT recurrence
km_plot_vte_recurrence =
ggsurvplot(
fit = km_fit_vte_recurrence,
data = survival_vte_tmerge,
xscale = 30,
break.time.by = 60,
censor = FALSE,
conf.int = FALSE,
xlim = c(0, 540),
# surv.plot.height = 1.25,
# tables.height = 0.25,
legend.title = " ",
legend.labs = c("No CAT", "CAT w/o recurrence", "CAT recurrence"),
risk.table = "abs_pct",
risk.table.height = 0.3,
fontsize = 3,
risk.table.col = "strata",
tables.y.text = FALSE,
#tables.y.text.col = TRUE,
risk.table.y.text.col = TRUE,
#linetype = "strata",
surv.median.line = "hv",
xlab = "Time (months)",
ylab = "OS Probability",
ggtheme = theme_bw(),
palette = c("black", "red", "#FF6600")
)
## Warning in geom_segment(aes(x = 0, y = max(y2), xend = max(x1), yend = max(y2)), : All aesthetics have length 1, but the data has 2 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
# Count number of patients of each category
n_vte_no_recurrence = sum(oncoth1_data$vte_recurrence == "No VTE recurrence")
n_vte_recurrence = sum(oncoth1_data$vte_recurrence == "VTE recurrence")
# Customize risk table title
km_plot_vte_recurrence$table =
km_plot_vte_recurrence$table +
theme(plot.title = element_text(size = 10), legend.position = "none")
# Add p-value and n of patient subgroups
km_plot_vte_recurrence$plot =
km_plot_vte_recurrence$plot +
# Log-rank p-value
annotate(
geom = "text",
x = 26,
y = 0.3,
label = "Log-rank",
color = "black") +
annotate(
geom = "text",
x = 28,
y = 0.2,
label = if (km_pval_vte_recurrence < 0.0001) {"< 0.0001"} else {km_pval_vte_recurrence},
color = "black") +
# n subgroups
annotate(
geom = "text",
size = 3.5,
x = 530,
y = 0.99,
label = "n=302",
color = "black") +
annotate(
geom = "text",
size = 3.5,
x = 530,
y = 0.91,
label = paste0("n=", n_vte_no_recurrence),
color = "red") +
annotate(
geom = "text",
size = 3.5,
x = 530,
y = 0.83,
label = paste0("n=", n_vte_recurrence),
color = "#FF6600")
km_plot_vte_recurrence
## Warning in geom_segment(aes(x = 0, y = max(y2), xend = max(x1), yend = max(y2)), : All aesthetics have length 1, but the data has 2 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## All aesthetics have length 1, but the data has 2 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## All aesthetics have length 1, but the data has 2 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
# # Save plot
# ggsave(
# filename = "../graphs/KM_curve_CAT_recurrence.jpeg",
# plot = km_plot_vte_recurrence$plot,
# width = 7,
# height = 4,
# units = "in"
# )
# dev.off()
# Save plot with table
# Combine the main plot and the cumulative events table
KM_curve_CAT_recurrence_combined_plot <- cowplot::plot_grid(
km_plot_vte_recurrence$plot,
km_plot_vte_recurrence$table,
ncol = 1,
rel_heights = c(1.5, 0.7) # Adjust heights to fit both plots
)
## Warning in geom_segment(aes(x = 0, y = max(y2), xend = max(x1), yend = max(y2)), : All aesthetics have length 1, but the data has 2 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
# Save plot
ggsave(
filename = "../graphs/KM_curve_CAT_recurrence.jpeg",
plot = KM_curve_CAT_recurrence_combined_plot,
width = 7,
height = 4,
units = "in"
)
# Train CPH model to assess weight of VTE recurrence
# Adjust by age, sex, cancer type and stage
cph_vte_recurrence = coxph(Surv(
time = tstart,
time2 = tstop,
event = death) ~
age_when_cancer_dx +
gender +
primary_tumor_simplified +
tnm_stage_grouped +
vte_event:vte_recurrence + # VTE recurrence only makes sense if there is VTE
cluster(id),
id = id,
data = survival_vte_tmerge
)
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 8 ; beta may be infinite.
# Test PH assumption
cox.zph(cph_vte_recurrence)
## chisq df p
## age_when_cancer_dx 0.379 1 0.54
## gender 0.392 1 0.53
## primary_tumor_simplified 6.123 3 0.11
## tnm_stage_grouped 2.937 2 0.23
## vte_event:vte_recurrence 4.371 3 0.22
## GLOBAL 16.356 10 0.09
# Inspect plot for violation of PH assumption
ggcoxzph(cox.zph(cph_vte_recurrence))
# Show betas and p-values
cph_vte_recurrence %>%
gtsummary::tbl_regression(exp = TRUE)
| Characteristic | HR1 | 95% CI1 | p-value |
|---|---|---|---|
| age_when_cancer_dx | 1.02 | 1.00, 1.04 | 0.043 |
| gender | |||
| Male | — | — | |
| Female | 0.84 | 0.57, 1.23 | 0.4 |
| primary_tumor_simplified | |||
| Colorectal | — | — | |
| NSCLC | 3.07 | 1.85, 5.10 | <0.001 |
| Oesophago-gastric | 2.94 | 1.74, 4.96 | <0.001 |
| Pancreatic | 4.93 | 2.90, 8.38 | <0.001 |
| tnm_stage_grouped | |||
| I-II | — | — | |
| III | 2.79 | 1.27, 6.12 | 0.010 |
| IV | 9.17 | 4.45, 18.9 | <0.001 |
| vte_event * vte_recurrence | |||
| vte_event * No VTE | 0.00 | 0.00, 0.00 | <0.001 |
| vte_event * No VTE recurrence | 2.43 | 1.60, 3.69 | <0.001 |
| vte_event * VTE recurrence | 2.62 | 1.54, 4.47 | <0.001 |
| 1 HR = Hazard Ratio, CI = Confidence Interval | |||
# C-index
concordance(cph_vte_recurrence)
## Call:
## concordance.coxph(object = cph_vte_recurrence)
##
## n= 840
## Concordance= 0.7912 se= 0.0168
## concordant discordant tied.x tied.y tied.xy
## 35164 9264 48 18 0
# 18-month OS rate
summary(km_fit_vte_recurrence, times = 18*30.25)
## Call: survfit(formula = Surv(time = tstart, time2 = tstop, event = death) ~
## vte_recurrence + cluster(id), data = survival_vte_tmerge,
## id = id)
##
## vte_recurrence=No VTE
## time n.risk n.event survival std.err lower 95% CI
## 544.5000 174.0000 89.0000 0.6944 0.0271 0.6433
## upper 95% CI
## 0.7496
##
## vte_recurrence=No VTE recurrence
## time n.risk n.event survival std.err lower 95% CI
## 544.500 32.000 40.000 0.465 0.058 0.364
## upper 95% CI
## 0.594
##
## vte_recurrence=VTE recurrence
## time n.risk n.event survival std.err lower 95% CI
## 544.5000 2.0000 9.0000 0.1818 0.1163 0.0519
## upper 95% CI
## 0.6369
Some pointers:
In a linear model (like the Cox regression), you generally should limit yourself to about 1 candidate predictor per 15 events in your dataset. https://stats.stackexchange.com/questions/178944/precisely-how-does-rs-coxph-handle-repeated-measures
One approach to dealing with a violation of the proportional hazards assumption is to stratify by that variable. Including a strata() term will result in a separate baseline hazard function being fit for each level in the stratification variable. It will be no longer possible to make direct inference on the effect associated with that variable.
Linear models do not work well when there is multicollinearity. Also, given the small sample size, we should limit our model to have a pool of features from which then apply automatic feature selection.
# Convert categorical variables into dummies (one-hot encoding)
survival_vte_tmerge_dummies = survival_vte_tmerge %>%
# Remove useless variables
select(-c(id, tstart, tstop, contains("tu_"))) %>%
# Convert KRS to numeric
mutate(khorana_risk_score_imp = as.numeric(khorana_risk_score_imp)) %>%
# Create dummies for all categorical variables (character and factors)
fastDummies::dummy_cols(., remove_first_dummy = TRUE, remove_selected_columns = TRUE)
# Variance per feature
variance_per_feature = sapply(survival_vte_tmerge_dummies, function (x) var(x, na.rm = TRUE))
variance_per_feature = variance_per_feature[order(variance_per_feature)]
# Threshold
variance_threshold = 0.1 # 0.05
# Variables with less than threshold variance, ordered from less to more
variance_per_feature[order(variance_per_feature[variance_per_feature < variance_threshold])]
## oral_contraceptive_tx_Yes
## 0.000000000
## anticoag_tx_apixaban_Yes
## 0.000000000
## anticoag_tx_dabigatran_Yes
## 0.000000000
## anticoag_tx_other_drugs_Yes
## 0.000000000
## ethnicity_Black/African American
## 0.001190476
## krs_category_NA
## 0.001190476
## metastasis_skin_Yes
## 0.001190476
## anticoag_tx_other_causes_Yes
## 0.001190476
## ethnicity_Sephardic Jew
## 0.002378115
## pregnancy_imp_Yes
## 0.002378115
## n_stage_imp_N3b
## 0.003562915
## n_metastases_5
## 0.003562915
## family_background_vte_all_relatives_Offspring
## 0.003562915
## family_background_vte_all_relatives_Others
## 0.003562915
## family_background_VTE_offspring_Yes
## 0.003562915
## n_previous_ate_3
## 0.003562915
## oral_contraceptive_tx_NA
## 0.004744878
## metastasis_soft_tissues_Yes
## 0.004744878
## family_background_vte_n_2
## 0.004744878
## family_background_vte_all_relatives_Two relatives
## 0.004744878
## previous_ate_type_Other
## 0.004744878
## other_thromb_ate_Yes
## 0.004744878
## anticoag_tx_cardiopathy_Yes
## 0.004744878
## anticoag_tx_vte_Yes
## 0.006992730
## family_background_vte_all_relatives_Siblings
## 0.007100289
## anticoag_tx_vit_k_antag_Yes
## 0.007100289
## family_background_VTE_other_relatives_Yes
## 0.007100289
## previous_ate_type_TIA
## 0.007100289
## tnm_stage_detailed_IIc
## 0.008273739
## family_background_VTE_siblings_Yes
## 0.008273739
## n_previous_ate_2
## 0.008273739
## n_antiaggreg_tx_2
## 0.008273739
## antiaggreg_tx_reason_Two ATE types
## 0.008273739
## anticoag_tx_lmwh_dosage_NA
## 0.009444350
## t_stage_imp_T1a
## 0.010612123
## anticoag_tx_lmwh_Yes
## 0.011140825
## tnm_stage_detailed_Ib
## 0.011777059
## previous_ate_type_Two or more
## 0.011777059
## anticoag_tx_reason_NA
## 0.011777059
## hormone_therapy_Yes
## 0.012939157
## t_stage_imp_T1b
## 0.012939157
## family_background_vte_all_relatives_Father
## 0.012939157
## major_surgery_in_medical_history_Yes
## 0.014098416
## metastasis_brain_Yes
## 0.014098416
## family_background_VTE_father_Yes
## 0.014098416
## tia_ate_Yes
## 0.015254839
## n_stage_imp_N1c
## 0.016408423
## anticoag_tx_vte_drugs_NA
## 0.016408423
## anticoag_tx_currently_Yes
## 0.016408423
## metastasis_adrenal_glands_Yes
## 0.017559169
## family_background_vte_all_relatives_Mother
## 0.017559169
## t_stage_imp_T2a
## 0.018707078
## family_background_VTE_mother_Yes
## 0.020994381
## previous_ate_type_PAD
## 0.020994381
## n_stage_imp_N2a
## 0.022133776
## previous_ate_type_CVA
## 0.022133776
## ethnicity_Latin-American
## 0.023270333
## n_metastases_4
## 0.023270333
## cva_ate_Yes
## 0.023270333
## antiaggreg_tx_reason_CVA
## 0.024404052
## chronic_cardiac_insufficiency_Yes
## 0.024404052
## ethnicity_grouped_Caucasian
## 0.026662977
## thromboprophylaxis_hospital
## 0.027788183
## t_stage_imp_T2b
## 0.027788183
## pad_ate_Yes
## 0.027788183
## antiaggreg_tx_cva_Yes
## 0.027788183
## n_stage_imp_N1b
## 0.028910551
## menopausal_status_imp_Peri-menopausal
## 0.031146773
## other_metastases_Yes
## 0.031146773
## previous_vte_Yes
## 0.031146773
## metastasis_pleura_Yes
## 0.032260628
## previous_ate_type_Myocardial infarction
## 0.033371644
## t_stage_imp_T1
## 0.035585164
## menopausal_status_imp_Pre-menopausal
## 0.037787332
## erythropoietin_tx
## 0.039978149
## body_surface_area
## 0.040845545
## family_background_vte_n_1
## 0.043243090
## myocardial_infarction_ate_Yes
## 0.043243090
## n_stage_imp_N1a
## 0.044325728
## family_background_vte_Yes
## 0.047556615
## antiaggreg_tx_reason_Cardiopathy
## 0.047556615
## tnm_stage_detailed_III
## 0.050761962
## n_metastases_3
## 0.050761962
## metastasis_bones_Yes
## 0.050761962
## renal_insufficiency_Yes
## 0.051824735
## n_stage_imp_N2b
## 0.052884670
## vte_recurrence_VTE recurrence
## 0.054996027
## antiaggreg_tx_cardiopathy_Yes
## 0.054996027
## tnm_stage_detailed_IIIc
## 0.058141779
## tnm_stage_detailed_IVb
## 0.058141779
## t_stage_grouped_T1
## 0.060871126
## t_stage_imp_T4b
## 0.062296385
## antiaggreg_tx_reason_Others
## 0.062296385
## antiaggreg_tx_other_causes_Yes
## 0.066405585
## tnm_stage_detailed_IIa
## 0.068443158
## performance_status_category_corrected_imp_2
## 0.070469380
## tnm_stage_detailed_IVa
## 0.072484250
## t_stage_imp_T2
## 0.077471763
## t_stage_grouped_NA
## 0.077471763
## tnm_stage_detailed_IIIa
## 0.078460753
## histology_type_imp_Epidermoid
## 0.080430217
## n_previous_ate_1
## 0.082388331
## tnm_stage_detailed_IIb
## 0.084335093
## m_stage_M1a
## 0.086270503
## previous_ate_Yes
## 0.092008627
## n_stage_grouped_N3
## 0.092687623
## age_when_cancer_dx_50_Older or 50 years old
## 0.094839378
## n_stage_grouped_NA
## 0.099500539
# Correlation matrix
oncoth1_correlations_vars = survival_vte_tmerge_dummies %>%
# Select relevant variables
select(
age_when_cancer_dx,
# `age_quartiles_61-69`,
# `age_quartiles_70-93`,
# `age_when_cancer_dx_50_Older or 50 years old`,
# `age_when_cancer_dx_65_Older or 65 years old`,
weight,
height,
body_surface_area,
bmi_value,
khorana_risk_score_imp,
death,
vte_event,
blood_transfusion,
immobilisation,
recent_major_surgery,
thromboprophylaxis_hospital,
erythropoietin_tx,
gender_Female,
`menopausal_status_imp_Pre-menopausal`,
`menopausal_status_imp_Peri-menopausal`,
`menopausal_status_imp_Post-menopausal`,
performance_status_category_corrected_imp_1,
performance_status_category_corrected_imp_2,
diabetes_mellitus_Yes,
dyslipidemia_Yes,
`tobacco_use_imp_Former smoker`,
`tobacco_use_imp_Active smoker`,
`tobacco_use_imp_grouped_Never`,
copd_imp_Yes,
arterial_hypertension_Yes,
chronic_cardiac_insufficiency_Yes,
renal_insufficiency_Yes,
previous_onco_surgery_Yes,
tumor_surgically_removed_Yes,
primary_tumor_simplified_NSCLC,
`primary_tumor_simplified_Oesophago-gastric`,
primary_tumor_simplified_Pancreatic,
tnm_stage_II,
tnm_stage_III,
tnm_stage_IV,
# metastasis_bones_Yes,
# metastasis_liver_Yes,
# metastasis_lung_Yes,
# metastasis_lymph_nodes_Yes,
# metastasis_peritoneum_Yes,
histology_type_imp_Adenocarcinoma,
histology_type_imp_Epidermoid,
mucinous_histology_imp_Yes,
`grade_histological_differentiation_imp_Moderately differentiated`,
`grade_histological_differentiation_imp_Poorly differentiated`,
catheter_device_imp_Yes,
venous_insufficiency_imp_Yes,
# n_comorbidities_1,
# `n_comorbidities_2+`,
family_background_vte_Yes,
previous_vte_Yes,
previous_ate_Yes,
anticoag_tx_currently_Yes,
antiaggreg_tx_currently_Yes
) %>%
cor(.)
# Heatmap and dendrogram
heatmap(oncoth1_correlations_vars)
# Interactive heatmap
plot_ly(
data = reshape2::melt(oncoth1_correlations_vars),
x = ~Var1,
y = ~Var2,
z = ~value,
type = "heatmap",
colors = colorRamp(c("blue", "white", "red")),
colorbar = list(title = "Correlation"),
showscale = TRUE
) %>%
layout(
title = "Interactive Heatmap of Correlations in ONCOTHROMB1",
xaxis = list(title = "Variables"),
yaxis = list(title = "Variables")
)
# Do correlation between CVC and tumor surgically removed with primary tumor
# Calculate Cramér's V
# Close to 0: Weak or no association.
# Around 0.1: Small association.
# Around 0.3: Moderate association.
# Around 0.5: Strong association.
# Close to 1: Very strong or perfect association.
# Primary tumor and CVC
contingency_table1 <- table(
oncoth1_data$primary_tumor_simplified,
oncoth1_data$catheter_device
)
cramers_v1 <- round(sqrt(chisq.test(contingency_table1)$statistic / sum(contingency_table1)), digits = 4)
print(paste("Cramer's V for cancer type and CVC =", cramers_v1))
## [1] "Cramer's V for cancer type and CVC = 0.4388"
# Primary tumor and tumor surgically removed
contingency_table2 <- table(
oncoth1_data$primary_tumor_simplified,
oncoth1_data$tumor_surgically_removed
)
cramers_v2 <- round(sqrt(chisq.test(contingency_table2)$statistic / sum(contingency_table2)), digits = 4)
print(paste("Cramer's V for cancer type and surgical resection of tumor =", cramers_v2))
## [1] "Cramer's V for cancer type and surgical resection of tumor = 0.3397"
# Family background of VTE and venous insufficiency
contingency_table3 <- table(
oncoth1_data$family_background_vte,
oncoth1_data$catheter_device
)
cramers_v3 <- round(sqrt(chisq.test(contingency_table3)$statistic / sum(contingency_table3)), digits = 4)
print(paste("Cramer's V for family background VTE and venous insufficiency =", cramers_v3))
## [1] "Cramer's V for family background VTE and venous insufficiency = 0.0076"
StepReg package vignette:
https://cran.r-project.org/web/packages/StepReg/vignettes/StepReg.html
# Create dataset
survival_vte_tmerge_stepwise = survival_vte_tmerge %>%
filter(!is.na(n_comorbidities))
# Do one-hot encoding with primary tumour
survival_vte_tmerge_stepwise = fastDummies::dummy_cols(survival_vte_tmerge_stepwise, select_columns = "primary_tumor_simplified")
# Rename with proper format
survival_vte_tmerge_stepwise %<>%
rename(primary_tumor_simplified_Oesophagogastric = `primary_tumor_simplified_Oesophago-gastric`)
See if there is consistency in variable selection and predictive capability of CV partitions, as well as with test set.
# Target distribution when event of interest is death
table(oncoth1_data$patient_status_at_end_study_simplified, useNA = "always")
##
## Alive Dead Unknown <NA>
## 236 149 5 0
# Partition data with repeated rows per patient into training and test set
# Set seed
set.seed(seed)
# Get unique IDs
unique_ids <- unique(survival_vte_tmerge_stepwise$id)
# Randomly assign each unique ID to either train or test
train_ids <- sample(unique_ids, size = floor(0.8 * length(unique_ids))) # 80% for training
test_ids <- setdiff(unique_ids, train_ids) # Remaining IDs for testing
# Partition the data into train and test sets based on the assigned IDs
stepwise_training_set <- survival_vte_tmerge_stepwise %>% filter(id %in% train_ids)
stepwise_testing_set <- survival_vte_tmerge_stepwise %>% filter(id %in% test_ids)
# Check that IDs are not in training and test sets
stopifnot(
identical(
sort(unique(stepwise_training_set$id)), sort(unique(stepwise_testing_set$id))) == FALSE)
# Create folds for k-fold CV
# Set seed
set.seed(seed)
# Number of folds
k = 10
# Create a grouping variable for each ID
group_var <- as.numeric(factor(stepwise_training_set$id))
# Determine the unique IDs and shuffle them randomly to ensure a balanced distribution across the folds
# Get unique IDs
folds_unique_ids <- unique(group_var)
# Shuffle the unique IDs randomly
folds_shuffled_ids <- sample(folds_unique_ids)
# Divide the shuffled IDs into folds
cv_folds_index <- cut(folds_shuffled_ids, breaks = k, labels = FALSE)
# Save IDs in file for reproducibility
run_datetime = Sys.time()
# Function to append data to the file
append2file = function(file_name = "./ids_partition_run.csv", data) {
write.table(
x = data,
file = file_name,
append = TRUE,
sep = "\t",
col.names = !file.exists(file_name),
row.names = FALSE,
quote = FALSE
)
}
# Set seed
set.seed(seed)
# Initialize a vector to store the evaluation results
stepwise_cv_index_valid <- vector("list", length = k)
stepwise_cv_vars_results <- vector("list", length = k)
stepwise_cv_cindex_results <- vector("list", length = k)
stepwise_cv_vif_results <- vector("list", length = k) # Inspection of GVIF in trained models is necessary to detect multicollinearity issues
for (i in 1:k) {
# Get the validation IDs
validation_ids <- unique_ids[cv_folds_index == i]
# Get the validation fold
validation_fold <- stepwise_training_set[stepwise_training_set$id %in% validation_ids, ]
# Get the training fold
training_fold <- stepwise_training_set[!(stepwise_training_set$id %in% validation_ids), ]
# print("Training")
# print(head(training_fold, n = k))
#
# print("Validation")
# print(head(validation_fold, n = k))
# Check that IDs are not in training and test sets
stopifnot(
identical(
sort(unique(training_fold$id)), sort(unique(validation_fold$id))) == FALSE)
# Train your model on the training fold
stepwise_model = StepReg::stepwiseCox(
formula = Surv(time = tstart, time2 = tstop, event = death) ~
vte_event +
age_when_cancer_dx +
gender +
bmi_value +
performance_status_category_corrected_imp + # Multicollinearity with cancer type, tobacco use, tumor resection in some folds, but models can be trained
# diabetes_mellitus + # Confounder. We know it is associated with pancreatic cancer patients that undergo resection of pancreas
dyslipidemia +
# tobacco_use_imp + # Correlated to NSCLC. Multicollinearity if PS is also selected
# copd_imp + # Correlated to tobacco use
arterial_hypertension +
# venous_insufficiency_imp + # remove as suggested by Andrés Muñoz (bad definition of variable, information may refer to different conditions)
# khorana_risk_score_imp + # Correlated to pancreatic and esophago-gastric cancers
# two_groups_krs + # Correlated to pancreatic and esophago-gastric cancers
# previous_onco_surgery + # Correlated to having had tumor resected
tumor_surgically_removed +
primary_tumor_simplified +
# tnm_stage_grouped +
tnm_stage_two_groups +
# histology_type_imp + # Correlated with NSCLC
mucinous_histology_imp +
# grade_histological_differentiation_imp + # violates PH assumption
# catheter_device_imp + # quite correlated with tumor type, as Cramer's V indicates
# family_background_vte + # High GVIF, multicollinearity with venous insufficiency in some folds. When selected, not clear if it is risk or protective factor, due to lack of information when recruiting patients
previous_vte +
previous_ate +
anticoag_tx_currently +
antiaggreg_tx_currently,
data = training_fold,
include = "primary_tumor_simplified",
selection = "bidirection",
select = "AIC",
method = "efron"
)
# Save selected variables
selected_variables = unname(unlist(stepwise_model$`Selected Varaibles`))
# print(selected_variables)
# Fit model with selected variables in this fold
# Create formula
fold_pcox_formula = as.formula(
sprintf('%s ~ %s', 'Surv(time = tstart, time2 = tstop, event = death)',
paste(selected_variables, collapse = " + ")))
# Fit model
fold_cph_model = coxph(
formula = fold_pcox_formula,
id = id,
data = validation_fold
)
# VIF
vif_results = rms::vif(fold_cph_model)
# print(vif_results)
# Make predictions on the validation fold
# The result is the hazard of suffering the event. It's difficult to interpret
predictions <- predict(fold_cph_model, newdata = validation_fold)
# print(predictions)
# Evaluate the predictions
# C-index
fold_cindex <- concordance(fold_cph_model, newdata = validation_fold)
# print(fold_cindex$concordance)
# Store the evaluation result
stepwise_cv_index_valid[[i]] = validation_ids
stepwise_cv_vars_results[[i]] = selected_variables
stepwise_cv_cindex_results[[i]] = fold_cindex$concordance
stepwise_cv_vif_results[[i]] = as.data.frame(vif_results)
}
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Ran out of iterations and did not converge
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 8 ; beta may be infinite.
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Ran out of iterations and did not converge
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 6,8 ; beta may be infinite.
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 7 ; beta may be infinite.
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 5 ; beta may be infinite.
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Ran out of iterations and did not converge
# Append info to file for experiment's reproducibility
info_df = data.frame(
"Run" = run_datetime,
"Seed" = seed,
"Train_IDs" = paste(train_ids, collapse = ", "),
"Test_IDs" = paste(test_ids, collapse = ", "),
"IDs_folds_shuffled" = paste(folds_shuffled_ids, collapse = ", "),
"index_cv_folds" = paste(cv_folds_index, collapse = ", "),
"Validation_IDs" = paste(stepwise_cv_index_valid, collapse = ", ")
)
# Append to CSV file
append2file(data = info_df)
# Variable selection frequency across 10-fold CV
stepwise_var_selection_freq = as.data.frame(table(unlist(stepwise_cv_vars_results)))
stepwise_var_selection_freq %<>% arrange(desc(Freq))
knitr::kable(stepwise_var_selection_freq)
| Var1 | Freq |
|---|---|
| performance_status_category_corrected_imp | 10 |
| primary_tumor_simplified | 10 |
| tnm_stage_two_groups | 10 |
| tumor_surgically_removed | 10 |
| vte_event | 10 |
| mucinous_histology_imp | 6 |
| age_when_cancer_dx | 1 |
| previous_vte | 1 |
# Mean C-index across 10-fold CV
mean(unlist(stepwise_cv_cindex_results))
## [1] 0.86057
# Final model with most frequently selected variables
stepwise_cv_cph_final_model = coxph(
Surv(time = tstart,
time2 = tstop,
event = death) ~
cluster(id) +
performance_status_category_corrected_imp +
primary_tumor_simplified +
tnm_stage_two_groups +
tumor_surgically_removed +
vte_event +
mucinous_histology_imp, # after removing venous insufficiency
# vte_event +
# performance_status_category_corrected_imp +
# primary_tumor_simplified +
# tnm_stage_two_groups +
# tumor_surgically_removed +
# venous_insufficiency_imp +
# mucinous_histology_imp,
id = id,
data = stepwise_training_set
)
# Check proportional hazards assumption for Cox model
cox.zph(stepwise_cv_cph_final_model)
## chisq df p
## performance_status_category_corrected_imp 2.53248 2 0.28
## primary_tumor_simplified 3.48145 3 0.32
## tnm_stage_two_groups 0.00028 1 0.99
## tumor_surgically_removed 0.00873 1 0.93
## vte_event 0.10323 1 0.75
## mucinous_histology_imp 0.81547 1 0.37
## GLOBAL 5.79076 9 0.76
# Show HR and 95% CI
stepwise_cv_cph_final_model %>%
gtsummary::tbl_regression(exp = TRUE)
| Characteristic | HR1 | 95% CI1 | p-value |
|---|---|---|---|
| performance_status_category_corrected_imp | |||
| 0 | — | — | |
| 1 | 1.10 | 0.71, 1.72 | 0.7 |
| 2 | 4.19 | 2.38, 7.37 | <0.001 |
| primary_tumor_simplified | |||
| Colorectal | — | — | |
| NSCLC | 1.91 | 1.13, 3.24 | 0.016 |
| Oesophago-gastric | 1.86 | 1.07, 3.23 | 0.028 |
| Pancreatic | 3.09 | 1.81, 5.26 | <0.001 |
| tnm_stage_two_groups | |||
| I-II-III | — | — | |
| IV | 2.74 | 1.81, 4.13 | <0.001 |
| tumor_surgically_removed | |||
| No | — | — | |
| Yes | 0.28 | 0.14, 0.56 | <0.001 |
| vte_event | 2.08 | 1.39, 3.11 | <0.001 |
| mucinous_histology_imp | |||
| No | — | — | |
| Yes | 1.40 | 0.94, 2.10 | 0.10 |
| 1 HR = Hazard Ratio, CI = Confidence Interval | |||
# Inspect VIF in final model
vif_results = car::vif(stepwise_cv_cph_final_model)
## Warning in vif.default(stepwise_cv_cph_final_model): No intercept: vifs may not
## be sensible.
vif_results
## GVIF Df GVIF^(1/(2*Df))
## performance_status_category_corrected_imp 1.260776 2 1.059643
## primary_tumor_simplified 1.407597 3 1.058635
## tnm_stage_two_groups 1.185783 1 1.088937
## tumor_surgically_removed 1.303690 1 1.141793
## vte_event 1.111697 1 1.054370
## mucinous_histology_imp 1.121929 1 1.059211
# C-index in testing set
concordance(stepwise_cv_cph_final_model, newdata = stepwise_testing_set)
## Call:
## concordance.coxph(object = stepwise_cv_cph_final_model, newdata = stepwise_testing_set)
##
## n= 155
## Concordance= 0.8973 se= 0.03567
## concordant discordant tied.x tied.y tied.xy
## 1038 110 20 0 0
# Akaike information criterion (AIC) value
AIC(stepwise_cv_cph_final_model)
## [1] 1276.862